## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## locfit 1.5-9.1    2013-03-22
## Loading required package: splines
## Loading required package: Matrix
## 
## Attaching package: 'fda'
## The following object is masked from 'package:graphics':
## 
##     matplot
## 
## Attaching package: 'refund'
## The following object is masked from 'package:locfit':
## 
##     lf
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## This version of Shiny is designed to work with 'htmlwidgets' >= 1.5.
##     Please upgrade via install.packages('htmlwidgets').
# #assuming tab separated values with a header
# filelist <- list.files(pattern = ".*.dly")
# datalist <- lapply(filelist, function(x)read.dly(x))

# Read Multiple dly files
# Station locations and the daily series analysed for each of them.
loclist <- c("Redwood", "Davis", "Ames")
tickerlist <- c( "TMIN.VALUE", "TMAX.VALUE", "SPREAD")
# Read each <location>.dly file into a variable named dat.<location>
# (e.g. dat.Davis); downstream chunks look these objects up by name.
for (loc in loclist) {
  assign(paste0("dat.", loc), read.dly(paste0(loc, ".dly")))
}

# Build per-(location, ticker) long data frames and fdapace input lists.
# Creates the global objects df.<loc>.<ticker> and denseSamp.<loc>.<ticker>
# via eval(parse(...)); every later chunk retrieves them by these exact
# names, so the naming scheme must not change.
for(loc in loclist){
    eval(parse(text = paste0("dat <- dat." ,loc)))
    dat <- subset(dat,YEAR!= 2019) # do not consider the latest year, due to missing data for the rest of the year
    dates <- data.frame(y  = dat$YEAR, m = dat$MONTH, d = dat$DAY)
    dat$dates <- dates
    # Number of rows in the first observed year; becomes the global `ndays`
    # reused by many later chunks.  NOTE(review): this assumes every year
    # contributes the same number of rows (no leap-year difference) --
    # confirm for the .dly data.
    ndays <- nrow(subset(dat$dates, dates$y == y[1])) #365 for each year
    # Within-year day index 1..ndays, recycled across all years.
    days <- rep(seq(ndays), length(unique(dat$YEAR) )) 
    dat$days <- days
    
    for(ticker in tickerlist){
        # summary(dat)
        # Long format: t = day within year, id = year, y = the measurement.
        df <- data.frame(t = days, id = dat$YEAR, y =dat[[ticker]]) # %>% na.omit 
        df$no <- 1:nrow(df)
        # List-of-curves representation expected by fdapace::FPCA.
        denseSamp <- MakeFPCAInputs(IDs=as.vector(df$id), tVec=df$t, yVec=df$y) 
        eval(parse(text = paste0("df.",loc, ".",ticker,"<- df") ) )
        eval(parse(text = paste0("denseSamp.",loc, ".",ticker,"<- denseSamp") ) )
    }
}
ylim <- c(-10,40) 
par(mfrow = c(1,2))
## Some preliminary data analysis
#### A peek at the original data
# NOTE(review): these two assignments are immediately overwritten by the
# loop variables below; they only matter if the loop is skipped when
# running interactively.
ticker <- "TMAX.VALUE"
loc <- "Davis"
for( loc in loclist){
  for( ticker in tickerlist){
    eval(parse(text = paste0("df <- df.",loc, ".",ticker) ))
    plot(y ~ t , df , type = "p", lty = 1, ylim = ylim, main = paste0("Averaged daily ", ticker,  " of ", loc))
    
    
    #### See which days repeat the previous day's observation (possible
    #### imputed/stale values): successive differences below .001
    diff <- abs(df$y[-1]  - df$y[1:(nrow(df)-1)])
    strangeInd <- which( diff <.001)
    # NOTE(review): the two expressions below are not auto-printed inside a
    # for loop; wrap them in print() to actually inspect the tables.
    cbind(df$y[strangeInd],df$y[strangeInd+1])
    table(df$id[strangeInd])
    
    #### A peek at the pointwise (cross-year) daily mean
    # Relies on the global `ndays` and on each year contributing exactly
    # ndays consecutive rows, so the matrix reshape aligns days by column.
    mu.max <- matrix(df$y, ncol = ndays, byrow = TRUE) %>% colMeans(na.rm = TRUE)
    lines(mu.max~ seq(ndays), type = "l", col = 2)
  }
}  

Mean function

## Mean estimation
#### Smoothing using Epanechnikov Kernel (local cubic fit)
hmu  <- 49     # smoothing bandwidth, in days
ngrid <- 365   # evaluation grid size (one point per day of year)
col <- 1: length(loclist)
ylim <- c(0,40) 

png("pre-smooth.png",width = 2000, height = 1000, units = "px", pointsize = 30)
par(mfrow = c(1,3))  # one panel per ticker, all locations overlaid
for( ticker in tickerlist){
    for( loc in loclist){
    eval(parse(text = paste0("df <- df.",loc, ".",ticker) ))
    df <- df %>% na.omit 
    # Local cubic smooth of the pooled (day, value) scatter for this
    # location/ticker, evaluated on an ngrid-point grid.
    resMu <- locfit (y ~ lp (t, h  = hmu, deg = 3 ), df, ev = lfgrid(mg = ngrid), kern = 'epan')
    tGrid <-  seq(min(df$t), max(df$t), length.out = ngrid)
    muHat <- predict (resMu)
    # BUG FIX: the original drew every non-first location with col = 2,
    # lty = 2, so the 2nd and 3rd locations were indistinguishable and did
    # not match the legend (lty 1:length(loclist)).  Index colour and line
    # type by the location's position instead.  (Also dropped ylim/xlab/ylab
    # from lines(), which are not valid lines() arguments.)
    i <- match(loc, loclist)
    if(loc == loclist[1])
      plot(tGrid, muHat, col = i, lwd = 2, lty = i, ylim = ylim, type = "l",xlab = "days", ylab = paste0("Smoothed daily  ", ticker))  # main = paste0("The smoothed estimation of Mean Daily ", ticker)
    else
      lines(tGrid, muHat, col = i, lwd = 2, lty = i)
    }
  legend("topleft",loclist, lty = 1:length(loclist) , lwd = 2 , col = 1:length(loclist))
}
dev.off()
## quartz_off_screen 
##                 2
## Covariance & FPCs Estimation (on the raw, unsmoothed Davis data)
hmu  <- 49
ngrid <- 365
col <- 1: length(loclist)
ylim <- c(0,40) 
## Dataframe stacking: concatenate TMIN and TMAX curves per year so one
## FPCA run estimates the joint (min, max) covariance structure.
#### Note the time for y_2 is shifted, otherwise an error of duplicate time t would occur
#### Also note the shift in time t would not affect the estimation of the mean function, covariance function or eigenfunctions. 
#### It would only affect the FPC scores.
df1 <- df.Davis.TMIN.VALUE %>% na.omit()
df2 <- df.Davis.TMAX.VALUE %>% na.omit()
#### Data Manipulation: truncated log transform (since there are temperature zero observations)
# df1$y <- log(df1$y + .001) 
# df2$y <- log(df2$y + .001) 
# Shift TMAX times past the end of the TMIN time range so the stacked
# curve has strictly distinct time points.
df2$t <- df2$t+diff(range(df2$t))+1
df.min.max <- rbind(df1, df2)%>% na.omit()
df <- df.min.max
# NOTE(review): resMu and tGrid are computed but not used below in this
# chunk -- possibly leftovers from an earlier version.
resMu <- locfit(y ~ lp(t, deg=1), df, ev=lfgrid(mg=ngrid)) 
tGrid <- seq(min(df$t), max(df$t), length.out=ngrid)  # working grid
samp <- MakeFPCAInputs(df$id, df$t, df$y) 
# FVEthreshold=1 keeps all components; bandwidths fixed to hmu.
res <- FPCA(samp$Ly, samp$Lt, list(userBwMu=hmu, userBwCov=hmu, FVEthreshold=1, dataType  = 'Dense')) 
# plot(res) 
matplot (res$phi[,1:3], type = "l", main = "First 3 PCs without Pre-smoothing")

plot(res$cumFVE, type = "l", xlab = "Number of PCs" , ylab ="Variance explained", main = "Variance explained without Pre-smoothing")

# Cross-covariance block between the TMIN half (first ndays points) and the
# TMAX half (second ndays points) of the stacked curves.
x1 =1:ndays
x2 = (ndays+1):(2*ndays)
cov = res$fittedCov[x1,x2]
persp(x =1:ndays , y = 1:ndays,  z = res$fittedCov[x1,x2])

plotdf <-list(x1 =1:ndays , x2 = 1:ndays,  cov = res$fittedCov[x1,x2])
# plotdf <-data.frame(res$fittedCov)
# qplot(plotdf, aes(x1= x1, x2 = x2 )) + geom_line() + facet_wrap(~K, scales='free_y')

# NOTE(review): y is mapped to plotdf$x1, likely a typo for plotdf$x2;
# harmless here because x1 and x2 are identical (both 1:ndays).
p <- plot_ly(z = ~plotdf$cov, x=~plotdf$x1,  y=~plotdf$x1) %>% add_surface()
p  <- layout(p,              
                title = paste0("Covariance estimation for ", loc, ticker),
                scene = list(xaxis = list(title = 't'),
                    yaxis = list(title = 's'),
                    zaxis = list(title = 'cov(t,s)', range = c(-1,8))))

p

The roughness of the mean function and covariance function stems from the noise in the original data. The paper proposes smoothing the original data to deal with this problem.

In addition, to alleviate the roughness at the edges between segments, the author smoothed the data over extended segments at both ends of each segment, and evaluated the fit only on the support of each segment. That is, the observations near the boundary of each segment are used twice, to provide information about the edge behaviour shared by two consecutive segments. The size of the overlapped region is proposed to be \(2bq\), where \(b\) is the bandwidth and \(q\) the number of segments.

Smoothing of original temperature curve

Note that splitting the dataframe is needed, since a dataframe that is too large can cause computational issues.

#### WARNING: This chunk takes a long time  (~2 min)

## OverlapSplit 
## https://stackoverflow.com/questions/5653756/split-a-data-frame-into-overlapping-dataframes
## Split a data frame into nsplit+1 consecutive chunks of (roughly) equal
## size that overlap their neighbours by `overlap` rows.
## https://stackoverflow.com/questions/5653756/split-a-data-frame-into-overlapping-dataframes
##
## Args:
##   x       data frame (anything NROW() and `[` understand).
##   nsplit  number of split points; nsplit + 1 chunks are returned.
##   overlap number of rows shared by two consecutive chunks.
## Returns a list with:
##   splitted  list of the nsplit+1 overlapping sub-data-frames,
##   start     first row index of each chunk,
##   fullind   full row-index vector of each chunk.
## Note: the chunk size is rounded up with ceiling(), so the last chunk may
## index past NROW(x); those trailing rows come back as NA rows and a
## warning is issued.
OverlapSplit <- function(x,nsplit=1,overlap=2){
    nrows <- NROW(x)
    # Rows per chunk, rounded up so the chunks cover all of x.
    nperdf <- ceiling( (nrows + overlap*nsplit) / (nsplit+1) )
    start <- seq(1, nsplit*(nperdf-overlap)+1, by= nperdf-overlap )
    fullind <- lapply(start, function(i) c(i:(i+nperdf-1)))
    # BUG FIX: the last row covered is start + nperdf - 1 (inclusive), so
    # the original test `start[nsplit+1] + nperdf != nrows` was off by one
    # and warned even when the split was exactly complete.
    if( start[nsplit+1] + nperdf - 1 != nrows )
        warning("Returning an incomplete dataframe.")
    splitted <- lapply(start, function(i) x[c(i:(i+nperdf-1)),])
    return(list(splitted  =splitted , start = start, fullind = fullind ))
}


# Segment-wise pre-smoothing of each (location, ticker) series.  The series
# is split into nseg overlapping segments (overlap = 2*hmu*nseg rows, i.e.
# hmu*nseg rows contributed by each side), each segment is smoothed with a
# local cubic fit, and only the non-overlapped interior (plus the outer
# edges of the first/last segments) is kept, to avoid edge artefacts at
# segment boundaries.  Results are stored as globals dfs.<loc>.<ticker>
# (and per-segment dfs.<loc>.<ticker>.<i>) via eval(parse(...)).
ticker <- tickerlist[1] #"TMAX.VALUE"
loc <- loclist[1]# "Davis"
nsplit <- 4
nseg <- nsplit + 1
hmu <- 49

for( loc in loclist){
  for( ticker in tickerlist){
    
  eval(parse(text = paste0("df <- df.",loc, ".",ticker) ))
  df.split <-OverlapSplit(df, nsplit = nsplit, overlap = 2* hmu * nseg)
  # NOTE(review): fullind is extracted but never used below.
  fullind <- df.split$fullind
  smoothed <- c()
   
    for( i in 1: nseg){
      # i = 1
      # i = nseg
      df.i <- df.split$splitted[[i]]
      nrow <- nrow(df.i)
      # Keep only the rows this segment is responsible for: the first
      # segment keeps its left edge, the last keeps its right edge, interior
      # segments drop hmu*nseg overlap rows on both sides.
      if( i == 1 ){
        evaind <- 1:(nrow - hmu * nseg)
      }else
      if(i == nseg){
        evaind <- (hmu * nseg  + 1): nrow
      }else 
        evaind <- (hmu * nseg  + 1): (nrow - hmu * nseg)
      
      # Deal with years with no data
      if( all(is.na(df.i$y))) {
        df.i $ smoothed <- rep(NA, nrow)
      } else{
      # Smooth against the row index (not t), evaluating at every row.
      resMu <- locfit(y ~ lp(1:nrow,  deg = 3, h = hmu ), df.i , kern  = "epan", ev = 1:nrow)
      df.i $ smoothed <- predict(resMu)
      }
      
      df.i <- df.i[evaind,]
      
      # store the smoothed data
      smoothed <- rbind(smoothed, df.i)
      eval(parse(text = paste0("dfs.",loc, ".",ticker,".", i," <- df.i") ))
      cat(i, "-th segment has been smoothed.\n")
    }
    
  eval(parse(text = paste0("dfs.",loc, ".",ticker," <- smoothed") ))
  }
}    

# Sanity check: overlay original vs smoothed series for one series.
# BUG FIX: the data plotted are dfs.Davis.SPREAD, but the first title said
# "Maximum temperature" -- corrected to SPREAD.
compare <- dfs.Davis.SPREAD [c("y", "smoothed")] # %>% na.omit()# [1:ndays,]
matplot(compare%>% na.omit(), type = "l" , main = "Original data VS smoothed data of Davis Daily SPREAD")
# First ndays rows = first observed year.  NOTE(review): the "1893" in the
# title assumes the Davis record starts in 1893 -- confirm against the data.
matplot(compare[1:ndays,], type = "l" , main = "Original data VS smoothed data of Davis Daily SPREAD in 1893")

# save the workspace
# save.image("smoothed.RData")
# NOTE(review): loading here replaces the objects just computed above with
# the cached copies in smoothed.RData -- confirm the file is up to date.
load("smoothed.RData") 

Remark: special attention should be paid to the smoothed curve here. Although the smoothing enables us to estimate a complete curve for a year with partially missing data, we should not use the completed curve, since interpolation can produce extreme values.

# Run FPCA on the pre-smoothed curves for every (location, ticker) pair.
# Rows where the raw y is NA are dropped first, so interpolated values at
# unobserved days are never used.  Results are stored as globals
# dfs.<loc>.<ticker>.res via eval(parse(...)).
for(ticker in tickerlist){
  for(loc in loclist){
    eval(parse(text   = paste0("df <- dfs.",loc, ".",ticker)))
    df <- df[!is.na(df$y), ]
    # NOTE(review): resMu and tGrid are computed but not used below in this
    # chunk -- possibly leftovers from an earlier version.
    resMu <- locfit(smoothed ~ lp(t, deg=1, h=hmu), df, ev=lfgrid(mg=ngrid)) 
    tGrid <- seq(min(df$t), max(df$t), length.out=ngrid)  # working grid
    samp.s <- MakeFPCAInputs(df$id, df$t, df$smoothed) 
    # FVEthreshold=1 keeps all components; bandwidths fixed to hmu.
    res.s <- FPCA(samp.s$Ly, samp.s$Lt, list(userBwMu=hmu, userBwCov=hmu, FVEthreshold=1, dataType  = 'Dense')) 
    eval(parse(text   = paste0("dfs.",loc, ".",ticker, ".res <- res.s")))
  }

}
# par(mfrow = c(1,3))

# Plot the fitted covariance surface of each smoothed series, as a base
# persp() surface and a plotly heatmap.
for( ticker in tickerlist){
    for( loc in loclist){
        eval(parse(text = paste0("res.s <- dfs.",loc, ".",ticker,".res") ))
      
        ## Compare Covariance Matrix
        cov.s = res.s$fittedCov
        persp(x =1:ndays , y = 1:ndays,  z = cov.s)
        plotdf.s <-list(x1 =1:ndays , x2 = 1:ndays,  cov = cov.s)

        #### widgets
        # p.s <- plot_ly(z = ~plotdf.s$cov, x=~plotdf.s$x1,  y=~plotdf.s$x2) %>% add_surface(text =c( "s",  "t",  "cov(s,t)"))
        # p.s  <- layout(p.s,              
                    # title = paste0("Covariance estimation for ", loc, ticker),
                    # scene = list(xaxis = list(title = 't'),
                    #     yaxis = list(title = 's'),
                    #     zaxis = list(title = 'cov(t,s)', range = c(-1,8))))
        # eval(parse(text   = paste0("p.s.",loc, ".",ticker, "<- p.s")))
        # orca(p.s, paste0("p.s.",loc, ".",ticker, ".png"))
        # htmlwidgets::saveWidget(p.s, paste0("p.s.",loc, ".",ticker, ".html"))
        
        #### 2d-heatmap
        # png(file = paste0("p.s.",loc, ".",ticker, ".cov.png") ,width = 2000, height = 1000, units = "px", pointsize = 30)
        p.s <- plot_ly(z = ~plotdf.s$cov, x=~plotdf.s$x1,  y=~plotdf.s$x2, type = "heatmap", colorbar = list(title = "cov(t,s)") )
        p.s  <- layout(p.s,              
                    title = paste0("Covariance estimation for ", loc, ticker),
                    xaxis = list(title = "t"),
                    yaxis = list(title = "s")
                    )
        # BUG FIX: a bare `p.s` inside a for loop is not auto-printed, so no
        # heatmap was ever rendered; print it explicitly.
        print(p.s)
        # orca(p.s, file = paste0("p.s.",loc, ".",ticker, ".cov.png") )
        # dev.off()
    }
}

## FPC1 Comparison (phi1): first eigenfunction of each location, overlaid
## per ticker.
# png("fpc1.png",width = 2000, height = 1000, units = "px", pointsize = 30)
par(mfrow = c(1,3))
for( ticker in tickerlist){
    for( loc in loclist){
        eval(parse(text = paste0("res.s <- dfs.",loc, ".",ticker,".res") ))
        # BUG FIX: the original drew every non-first location with col = 2,
        # lty = 2, so the 2nd and 3rd locations were indistinguishable and
        # did not match the legend; index colour/linetype by position.
        # (Also removed unused cov.s/plotdf.s locals and fixed the "PFC"
        # typo in the axis label.)
        i <- match(loc, loclist)
        if(loc == loclist[1])
          plot(1:ndays, res.s$phi[,1], col = i, lwd = 2, lty = i, ylim = c(-.01, .1), type = "l", xlab = "days", ylab = paste0("First FPC ", ticker))
        else
          lines(1:ndays, res.s$phi[,1], col = i, lwd = 2, lty = i)
    }
  legend("topleft",loclist, lty = 1:length(loclist) , lwd = 2 , col = 1:length(loclist))
  
}

# NOTE(review): with the png() call above commented out, this closes the
# currently active graphics device instead of a file device.
dev.off()
## null device 
##           1
## FPC2 Comparison (phi2): second eigenfunction of each location, overlaid
## per ticker.
# png("fpc2.png",width = 2000, height = 1000, units = "px", pointsize = 30)
par(mfrow = c(1,3))
for( ticker in tickerlist){
    for( loc in loclist){
        eval(parse(text = paste0("res.s <- dfs.",loc, ".",ticker,".res") ))
        # BUG FIX: same as the FPC1 panel -- index colour/linetype by the
        # location's position so all locations are distinguishable and match
        # the legend; removed unused cov.s/plotdf.s locals; fixed "PFC".
        i <- match(loc, loclist)
        if(loc == loclist[1])
          plot(1:ndays, res.s$phi[,2], col = i, lwd = 2, lty = i, ylim = c(-.15, .15), type = "l", xlab = "days", ylab = paste0("Second FPC ", ticker))
        else
          lines(1:ndays, res.s$phi[,2], col = i, lwd = 2, lty = i)
    }
  legend("topleft",loclist, lty = 1:length(loclist) , lwd = 2 , col = 1:length(loclist))
}
# NOTE(review): with the png() call above commented out, this closes the
# currently active graphics device instead of a file device.
dev.off()
## null device 
##           1
## FPC1 scores (xi1): plot the first FPC score of each year against the
## year, per ticker, and fit a linear trend per location.
# png("xi1.png",width = 2000, height = 1000, units = "px", pointsize = 30)
par(mfrow = c(1,3))
t.res <- c()
power.res <- c()
for( ticker in tickerlist){
    for( loc in loclist){
        eval(parse(text = paste0("res.s <- dfs.",loc, ".",ticker,".res") ))
        xi.s <- res.s$xiEst
        # Year labels recovered from the names of the FPCA input curves.
        years <- as.vector(as.numeric(names(res.s$inputData$Lt)))
        if(loc == loclist[1]){
           plot(years, xi.s[,1], col = 1, lwd = 2, lty = 1, type = "p",xlab = "days", ylab = paste0("xi1", ticker))  # main = paste0("The smoothed estimation of Mean Daily ", ticker)
        reg1 <- (lm(xi.s[,1] ~ years))
        abline(reg1$coefficients, col = 1, lwd = 2, lty = 1, ylim = c(-.01, .1), xlab = "days")
        }else{
          points(years, xi.s[,1], col = 2, lwd = 2, lty = 2)  # main = paste0("The smoothed estimation of Mean Daily ", ticker)
          reg2 <- (lm(xi.s[,1] ~ years))
          abline(reg2$coefficients, col = 2, lwd = 2, lty = 2, ylim = c(-.01, .1), xlab = "days")
        }
        
    }
  ## Store the t-statistics and powers
  # NOTE(review): with three locations, reg2 is refit for every non-first
  # location, so only the first (reg1) and the LAST location's regression
  # (reg2) are stored -- the middle location's fit is overwritten.  This
  # matches the two-row output below, but confirm it is intentional.
  # NOTE(review): column 4 of the coefficient table is the p-value, so
  # `power.res` actually stores p-values, not statistical power.
  t.res <- cbind(t.res, c(summary(reg1)$coeff[2,3],summary(reg2)$coeff[2,3]))
  power.res <- cbind(power.res, c(summary(reg1)$coeff[2,4],summary(reg2)$coeff[2,4]))
  legend("topleft",loclist, lty = 1:length(loclist) , lwd = 2 , col = 1:2)
  
}


dev.off()
## null device 
##           1
t.res
##          [,1]       [,2]      [,3]
## [1,] 6.445655 -0.3390321 -5.943052
## [2,] 5.777814 -2.2537011 -7.819670
power.res
##              [,1]       [,2]         [,3]
## [1,] 6.186967e-09 0.73540248 5.653790e-08
## [2,] 6.684611e-08 0.02612685 2.910134e-12
# matplot (res.s$phi[,1:3], type = "l", main = "First 3 PCs with Pre-smoothing")
# plot(res.s$cumFVE, type = "l", xlab = "Number of PCs" , ylab ="Variance explained", main = paste0("Covariance estimation for ", loc, ticker))
## Covariance & FPCs Estimation (smoothed Redwood TMIN/TMAX, stacked)
hmu  <- 49
ngrid <- 365
col <- 1: length(loclist)
ylim <- c(0,40) 
## Dataframe stacking
#### Note the time for y_2 is shifted, otherwise an error of duplicate time t would occur
#### Also note the shift in time t would not affect the estimation of the mean function, covariance function or eigenfunctions. 
#### It would only affect the FPC scores.
df1 <- dfs.Redwood.TMIN.VALUE 
df2 <- dfs.Redwood.TMAX.VALUE 
plot(df1$smoothed,type = "l")

plot(df1[!is.na(df1$y), ]$smoothed, type = "l") # interpolation can give extreme data

# Shift TMAX times by one full year so the stacked curve has distinct times.
df2$t <- df2$t+ndays
dfs.min.max <- rbind(df1, df2)
# NOTE(review): logt is computed but never used below.
dfs.min.max$logt <- log(dfs.min.max$t) 
# Drop days with no raw observation so interpolated values are not used.
df <- dfs.min.max [!is.na(dfs.min.max $y), ]
plot(df$y, type = "l")

plot(df$smoothed, type = "l")

# NOTE(review): resMu and tGrid are not used below -- likely leftovers.
resMu <- locfit(smoothed ~ lp(t, deg=1, h=hmu), df, ev=lfgrid(mg=ngrid)) 
tGrid <- seq(min(df$t), max(df$t), length.out=ngrid)  # working grid
samp.s <- MakeFPCAInputs(df$id, df$t, df$smoothed) 
res.s <- FPCA(samp.s$Ly, samp.s$Lt, list(userBwMu=hmu, userBwCov=hmu, FVEthreshold=1, dataType  = 'Dense')) 
matplot (res.s$phi[,1:3], type = "l", main = "First 3 PCs with Pre-smoothing")

plot(res.s$cumFVE, type = "l", xlab = "Number of PCs" , ylab ="Variance explained", main = "Variance explained with Pre-smoothing")

## Covariance between min and max: the off-diagonal ndays x ndays block of
## the fitted covariance of the stacked curves.
x1 =1:ndays
x2 = (ndays+1):(2*ndays)
cov.s = res.s$fittedCov[x1,x2]
persp(x =1:ndays , y = 1:ndays,  z = cov.s)

plotdf.s <-list(x1 =1:ndays , x2 = 1:ndays,  cov = res.s$fittedCov[x1,x2])
# BUG FIX: 'ylab', 'zlab', 'theta', 'phi' are base-graphics/persp()
# arguments, not plotly surface-trace attributes -- plotly warned and
# ignored them (see the warning recorded below).  Axis titles belong in
# layout(scene = ...).
p.s <- plot_ly(z = ~plotdf.s$cov, x=~plotdf.s$x1,  y=~plotdf.s$x2) %>% add_surface()
p.s <- layout(p.s,
              scene = list(xaxis = list(title = "t"),
                           yaxis = list(title = "s"),
                           zaxis = list(title = "cov(t,s)")))
p.s
## Warning: 'surface' objects don't have these attributes: 'ylab', 'zlab', 'theta', 'phi'
## Valid attributes include:
## 'type', 'visible', 'name', 'uid', 'ids', 'customdata', 'meta', 'hoverlabel', 'stream', 'uirevision', 'z', 'x', 'y', 'text', 'hovertext', 'hovertemplate', 'connectgaps', 'surfacecolor', 'cauto', 'cmin', 'cmax', 'cmid', 'colorscale', 'autocolorscale', 'reversescale', 'showscale', 'colorbar', 'coloraxis', 'contours', 'hidesurface', 'lightposition', 'lighting', 'opacity', '_deprecated', 'hoverinfo', 'xcalendar', 'ycalendar', 'zcalendar', 'scene', 'idssrc', 'customdatasrc', 'metasrc', 'zsrc', 'xsrc', 'ysrc', 'textsrc', 'hovertextsrc', 'hovertemplatesrc', 'surfacecolorsrc', 'hoverinfosrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'

Functional Variance Process

In this report, functional variance process (FVP) is introduced to extract the information in the stochastic time-trends in the noise variance of functional data.

Consider \(Y_1, \ldots, Y_n\) to be \(n\) continuous smooth random functions defined over a real interval \([0,T]\). Here we also assume that these functions are observed on a dense grid of time points \(t_j = \frac{j-1}{m-1} T,\ j = 1, \ldots, m\), with measurements \[Y_{ij} = Y_i (t_j) + R_{ij}, \quad i = 1, \ldots, n , \quad j = 1, \ldots, m,\] where \(R_{ij}\) are additive noise terms such that \(E(R_{ij} R_{i'j}) = 0\) for \(i \neq i'\), \(E(R) = 0\) and \(E(R^2) < \infty\). We also decompose the noise into two parts: \(V(t_{ij})\), from the underlying smooth functional variance process, and \(W(t_{ij})\), a noise component. We assume these two components satisfy \[R_{ij}^2 = \exp(V(t_{ij})) \exp(W(t_{ij})).\] Performing a change of variables, we have \[Z_{ij} := \log(R_{ij}^2) = V(t_{ij}) + W(t_{ij}),\] \[E(Z_{ij}) = E(V(t_{ij})) =: \mu_V(t_{ij}),\] \[Cov(Z_{ij},Z_{i'j}) = Cov(V(t_{ij}),V(t_{i'j})).\]

Further assume the mean and autocovariance of \(V(t)\) are as follows: \[E(V(t)) = \mu_V (t),\] \[C_{VV} (s,t) = \sum_{k = 1}^\infty \rho_k \Psi_k(s) \Psi_k(t),\] where \(\rho_1 \geq \rho_2 \geq \cdots \geq 0\) are the ordered non-negative eigenvalues, and \(\Psi_k\) are the corresponding orthonormal eigenfunctions of \(V\).